Data preparations

Construct data frame

load("XSTSF_production.RData")
source('functions.R')

# add human perceptual sandhi categories
label_sandhi <- read.csv('raw_data/sandhi_label.csv', 
                         na.strings = '') 
f0_all_pre_label <- f0_all_pre %>% 
  select(!sandhi_tone) %>% 
  left_join(label_sandhi[, c('ind_no', 'sandhi_tone', 'sandhi_tone_var', 'diortri')], 
            by = c('diortri', 'ind_no')) %>% 
  mutate(sandhi_tone_var = case_when(is.na(sandhi_tone_var) == TRUE ~ sandhi_tone,
                                     .default = sandhi_tone_var)) %>% 
  rename(normtime = time)

# get disyllabic citation data 
f0_di <- f0_all_pre_label %>% filter(diortri == 'di') 
f0_di_ct <- f0_di %>% filter(focus_condition == 'ct') %>% 
  # re-normalisation
  group_by(speaker) %>%
    mutate(f0ref = mean(f0, na.rm = T),
           norm_f0 = scale(log(f0))) %>% 
    ungroup()

# get individual datasets
f0_di_ct_lcmh <- f0_di_ct %>% filter(grepl("^[LM]", syntax_iniTone)) %>% 
  mutate(sandhi_tone = case_when(sandhi_tone == 'HLLM' ~ 'HMML',
                                 sandhi_tone == 'LLHL' ~ 'LLRF', 
                                 .default = sandhi_tone))
# check manual labels
unique(f0_di_ct_lcmh$sandhi_tone)
## [1] "HMML"    "MHHL"    "LLLM"    "LLRF"    "LMML"    NA        "MMMH"   
## [8] "HHHH"    "outlier"
f0_di_ct_lcmh_h <- f0_di_ct_lcmh %>% filter( grepl('^H', mono_tone_1))
f0_di_ct_lcmh_hp <- f0_di_ct_lcmh_h %>% filter(grepl('p$', syntax_iniTone))
f0_di_ct_lcmh_hs <- f0_di_ct_lcmh_h %>% filter(grepl('s$', syntax_iniTone))
f0_di_ct_lcmh_l <- f0_di_ct_lcmh %>% filter( grepl('^[LR]', mono_tone_1)) 
f0_di_ct_lcmh_lp <- f0_di_ct_lcmh_l %>% filter(grepl('p$', syntax_iniTone))
f0_di_ct_lcmh_ls <- f0_di_ct_lcmh_l %>% filter(grepl('s$', syntax_iniTone))

Initial data inspection

# yinping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hp, 'speaker'), tooltip = c('text', 'x'))
# yinshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_hs, 'speaker'), tooltip = c('text', 'x'))
# yangping-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_lp, 'speaker'), tooltip = c('text', 'x'))
# yangshang-initial LC & MH
ggplotly(draw_by(f0_di_ct_lcmh_ls, 'speaker'), tooltip = c('text', 'x'))

Perceptual analysis

H-register-initial LC & MH: average

f0_di_ct_lcmh_h <- f0_di_ct_lcmh_h %>% 
  mutate(sandhi_tone = ifelse(sandhi_tone == 'HLLM', 'HMML', sandhi_tone),
         propdur = as.integer(normtime)/20) %>% 
  unite('groupvar', ind_no, syllable_no, sep = '_', remove = FALSE) %>% 
  filter(is.na(sandhi_tone) == FALSE) %>% 
  filter(!ind_no %in% c('S2_1_ct', 'S2_11_ct', 'S2_27_ct', 'S3_5_ct', 'S3_19_ct', 'S5_27_ct')) 
  
unique(f0_di_ct_lcmh_h$sandhi_tone) # check the labels
## [1] "HMML" "MHHL" "MMMH" "HHHH"
p_cluster(f0_di_ct_lcmh_h, sandhi_tone)
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

H-register-initial LC & MH: individual categories

p_cluster(f0_di_ct_lcmh_h, sandhi_tone, 'speaker')
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

# examine individual cases
f0_di_ct_lcmh_h %>% 
  filter(sandhi_tone == 'HMML' & speaker == 'S2') %>% 
  select('token', 'groupvar') %>% distinct()
## # A tibble: 2 × 2
##   token groupvar  
##   <chr> <chr>     
## 1 新路  S2_51_ct_1
## 2 新路  S2_51_ct_2

H-register-initial LC & MH: average

k-means clustering analysis

H-register LC & MH

f0_di_ct_lcmh_h_kmeans <- f0_di_ct_lcmh_h %>% 
  select(-diortri, -syllable_no, -focus_no, -f0, -groupvar, -propdur) %>% 
  spread(normtime, norm_f0)

f0_di_ct_lcmh_h_cluster <- k_means(f0_di_ct_lcmh_h_kmeans)
kml(f0_di_ct_lcmh_h_cluster, nbClusters = 2:10) 
##  ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
plot(f0_di_ct_lcmh_h_cluster, 4, parTraj=parTRAJ(col="clusters"))

f0_di_ct_lcmh_h_kmeans <- f0_di_ct_lcmh_h_kmeans %>% 
  mutate(cluster4 = getClusters(f0_di_ct_lcmh_h_cluster, 4))

cluster_solution <- get_cluster_solution(f0_di_ct_lcmh_h_kmeans)
compare_cluster(cluster_solution, 'cluster4')
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.

# examine mismathes
f0_di_ct_lcmh_h_kmeans %>% filter(sandhi_tone == 'HMML' & cluster4 == 'C')
## # A tibble: 13 × 35
##    speaker token focus_condition citation_no ind_no   mono_tone_1 mono_tone_2
##    <fct>   <chr> <chr>           <fct>       <chr>    <chr>       <chr>      
##  1 S1      青椒  ct              1           S1_1_ct  HHp         HHp        
##  2 S1      书包  ct              3           S1_3_ct  HHp         HHp        
##  3 S2      新路  ct              51          S2_51_ct HHp         LHq        
##  4 S5      新车  ct              45          S5_45_ct HHp         HHp        
##  5 S7      樱花  ct              9           S7_9_ct  HHp         HHp        
##  6 S7      新车  ct              45          S7_45_ct HLp         HHp        
##  7 S7      新路  ct              51          S7_51_ct HLp         LHq        
##  8 S8      书包  ct              3           S8_3_ct  HHp         HHp        
##  9 S8      樱花  ct              9           S8_9_ct  HLp         HHp        
## 10 S8      青菜  ct              11          S8_11_ct HLp         HLq        
## 11 S8      樱桃  ct              17          S8_17_ct HLp         RFp        
## 12 S8      书房  ct              21          S8_21_ct HHp         RFp        
## 13 S8      青豆  ct              25          S8_25_ct HLp         LHq        
## # ℹ 28 more variables: mono_tone_3 <chr>, citation_tone <chr>, syntax <chr>,
## #   syntax_iniTone <chr>, sandhi_tone <chr>, sandhi_tone_var <chr>,
## #   f0ref <dbl>, `1` <dbl>, `2` <dbl>, `3` <dbl>, `4` <dbl>, `5` <dbl>,
## #   `6` <dbl>, `7` <dbl>, `8` <dbl>, `9` <dbl>, `10` <dbl>, `11` <dbl>,
## #   `12` <dbl>, `13` <dbl>, `14` <dbl>, `15` <dbl>, `16` <dbl>, `17` <dbl>,
## #   `18` <dbl>, `19` <dbl>, `20` <dbl>, cluster4 <fct>

try doing k-means for the whole disyllabic citation dataset

f0_di_ct_lcmh_kmeans <- f0_di_ct_lcmh %>% 
  filter(!sandhi_tone %in% c(NA, 'outlier')) %>% 
  select(-diortri, -syllable_no, -focus_no, -f0) %>% 
  spread(normtime, norm_f0)

f0_di_ct_lcmh_cluster <- k_means(f0_di_ct_lcmh_kmeans)
kml(f0_di_ct_lcmh_cluster, nbClusters = 2:10) 
##  ~ Fast KmL ~
## ***************************************************************************************************S
## 100 ********************************************************************************S
plot(f0_di_ct_lcmh_cluster, 7, parTraj=parTRAJ(col="clusters"))

f0_di_ct_lcmh_kmeans <- f0_di_ct_lcmh_kmeans %>% 
  mutate(cluster7 = getClusters(f0_di_ct_lcmh_cluster, 7))

Mapping between human and machine clustering

cluster_solution <- get_cluster_solution(f0_di_ct_lcmh_kmeans)
compare_cluster(cluster_solution, 'cluster7')
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.